rm(list = ls())
library(tidyverse, quietly = T, verbose = F)
## Warning: package 'dplyr' was built under R version 4.2.3
## Warning: package 'stringr' was built under R version 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# library(tibble)
# library(janitor)
library(wesanderson, quietly = T, verbose = F)
# library(biomaRt)
library(patchwork, quietly = T, verbose = F)
# library(depmap)
# library(Ipaper)
library(tidysq, quietly = T, verbose = F) #read fasta file
##
## Attaching package: 'tidysq'
##
## The following object is masked from 'package:dplyr':
##
## collapse
##
## The following object is masked from 'package:base':
##
## paste
library(readxl, quietly = T, verbose = F)
library(IRanges, quietly = T, verbose = F)
##
## Attaching package: 'BiocGenerics'
##
## The following object is masked from 'package:tidysq':
##
## paste
##
## The following objects are masked from 'package:lubridate':
##
## intersect, setdiff, union
##
## The following objects are masked from 'package:dplyr':
##
## combine, intersect, setdiff, union
##
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
##
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unsplit, which.max, which.min
##
##
## Attaching package: 'S4Vectors'
##
## The following objects are masked from 'package:lubridate':
##
## second, second<-
##
## The following objects are masked from 'package:dplyr':
##
## first, rename
##
## The following object is masked from 'package:tidyr':
##
## expand
##
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
##
##
## Attaching package: 'IRanges'
##
## The following objects are masked from 'package:tidysq':
##
## collapse, reverse
##
## The following object is masked from 'package:lubridate':
##
## %within%
##
## The following objects are masked from 'package:dplyr':
##
## collapse, desc, slice
##
## The following object is masked from 'package:purrr':
##
## reduce
source("00_functions_2024.R")
enst_expr <- read_rds("../results/01_wrangled_data/01_enst_expr.rds")
ensg_expr <- read_rds("../results/01_wrangled_data/01_ensg_expr.rds")
load(file = "../data/gte.Rdata") # transcripts for each gene + genomic coordinates
pheno_nh <- read_rds("../results/01_wrangled_data/01_pheno_no_haematological_cancers.rds")
pheno <- read_rds("../data/pheno.rds")
hgnc_from_ensembl <- readRDS(file = "../results/00_hgnc_from_ensembl_biomart.rds")
list_of_categories_nh <- unique(pheno_nh$TCGA_GTEX_main_category)
list_of_categories <- unique(pheno$TCGA_GTEX_main_category)
# # find the gene in ensg_exp with the highest number of isoforms in order to
# # decide the dimensions of the boxplot
#
# l <- vector(mode = "numeric", length = nrow(ensg_expr))
# for(i in 1:nrow(ensg_expr)) {
# ensg <- rownames(ensg_expr)[i]
# l[i] <- length(names(gte_list[[ensg]]))
#
# maxl <- (max(l))
#
# }
list_of_categories
## [1] "uterus"
## [2] "testis"
## [3] "stomach"
## [4] "skin"
## [5] "prostate"
## [6] "pancreas"
## [7] "ovary"
## [8] "lung"
## [9] "liver"
## [10] "kidney"
## [11] "brain"
## [12] "esophagus"
## [13] "colon"
## [14] "breast"
## [15] "bladder"
## [16] "cervix uteri"
## [17] "fallopian tube"
## [18] "blood"
## [19] "blood vessel"
## [20] "pituitary"
## [21] "heart"
## [22] "nerve"
## [23] "small intestine"
## [24] "spleen"
## [25] "thyroid"
## [26] "adrenal gland"
## [27] "salivary gland"
## [28] "vagina"
## [29] "adipose tissue"
## [30] "muscle"
## [31] "uveal melanoma"
## [32] "uterine corpus endometrioid carcinoma"
## [33] "uterine carcinosarcoma"
## [34] "thyroid carcinoma"
## [35] "thymoma"
## [36] "testicular germ cell tumor"
## [37] "stomach adenocarcinoma"
## [38] "skin cutaneous melanoma"
## [39] "sarcoma"
## [40] "rectum adenocarcinoma"
## [41] "prostate adenocarcinoma"
## [42] "pheochromocytoma & paraganglioma"
## [43] "pancreatic adenocarcinoma"
## [44] "ovarian serous cystadenocarcinoma"
## [45] "mesothelioma"
## [46] "lung squamous cell carcinoma"
## [47] "lung adenocarcinoma"
## [48] "liver hepatocellular carcinoma"
## [49] "kidney papillary cell carcinoma"
## [50] "kidney clear cell carcinoma"
## [51] "kidney chromophobe"
## [52] "head & neck squamous cell carcinoma"
## [53] "glioblastoma multiforme"
## [54] "esophageal carcinoma"
## [55] "diffuse large b-cell lymphoma"
## [56] "colon adenocarcinoma"
## [57] "cholangiocarcinoma"
## [58] "cervical & endocervical cancer"
## [59] "breast invasive carcinoma"
## [60] "brain lower grade glioma"
## [61] "bladder urothelial carcinoma"
## [62] "adrenocortical cancer"
## [63] "acute myeloid leukemia"
EGFR
Isoforms expression
# to view the categories, print list_of_categories
# import transcript data from ensembl website --> I want to keep only the protein coding transcripts
EGFR_transcripts <- read_xlsx("../data/isoforms/transcripts_EGFR.xlsx",
col_names = TRUE, skip = 1) # skip = 1 is necessary to have colnames
# select the tissues and cancer of interest and find isoforms expression in those tissues
EGFR_isoforms <- isoforms_selected_tissues(hgnc = "EGFR", TCGA_interest = c("glioblastoma multiforme", "lung squamous cell carcinoma", "lung adenocarcinoma", "breast invasive carcinoma", "stomach adenocarcinoma", "liver hepatocellular carcinoma"), GTEX_interest = c("heart", "kidney", "liver", "lung"), gte_list = gte_list, pheno = pheno_nh, enst_expr = enst_expr, hgnc_from_ensembl = hgnc_from_ensembl, ensembl_table = EGFR_transcripts)
# plot the tcga and gtex of interest
lt <- EGFR_isoforms$gtex$enst_id %>% unique() %>% length()
palette <- wes_palette("FantasticFox1", lt, "continuous")
plot_tcga <- ggplot(EGFR_isoforms$tcga,
mapping = aes(x = category, y = expression_level, fill = enst_id)) +
geom_boxplot(alpha = 0.9, outlier.shape = NA, width = 0.9) +
scale_fill_manual(values = palette) +
ylim(0, max(EGFR_isoforms$tcga$expression_level)) +
theme_light() +
xlab(" ") +
ylab("Log2(TPM)") +
theme(axis.text.x = element_text(angle = 40, hjust = 1, size = 35),
axis.text.y = element_text(size = 30),
axis.title = element_text(size = 30),
plot.title = element_text(size = 50),
legend.text = element_text(size = 38),
legend.title = element_text(size = 50),
plot.margin = unit(c(1,1,1,10), "cm"),
legend.position = "bottom") +
ggtitle(label = "EGFR expression for TCGA cancers")
plot_gtex <- ggplot(EGFR_isoforms$gtex,
mapping = aes(x = category, y = expression_level, fill = enst_id)) +
geom_boxplot(alpha = 0.9, outlier.shape = NA, width = 0.9) +
scale_fill_manual(values = palette) +
ylim(0, max(EGFR_isoforms$tcga$expression_level)) +
theme_light() +
xlab(" ") +
ylab("Log2(TPM)") +
theme(axis.text.x = element_text(angle = 40, hjust = 1, size = 35),
axis.text.y = element_text(size = 30),
axis.title = element_text(size = 30),
plot.title = element_text(size = 50),
# legend.text = element_text(size = 30),
# legend.title = element_text(size = 35),
plot.margin = unit(c(1,6,1,4), "cm"),
legend.position = "none") +
ggtitle(label = paste("EGFR expression for GTEX normal tissues"))
(plot <- plot_tcga + plot_gtex)

ggsave(filename = "EGFR_isoforms_expression_selected_tissues_plot.pdf", plot = plot, device = "pdf",
path = "../results/plots/isoform_expression/",
width = 120, height = 55, units = "cm", limitsize = F)
Protein properties
deeptmhmm_output <- read.table("../results/Deeptmhmm/canonical_and_isoforms/EGFR", fill = T)
# import object (created using the msa script)
aligned_sequences <- read_fasta(paste("../results/aligned_sequences/EGFR_msa.txt"))
(EGFR_topology <- protein_properties(hgnc = "EGFR", hgnc_from_ensembl = hgnc_from_ensembl, ensembl_table = EGFR_transcripts, deeptmhmm_output = deeptmhmm_output, aligned_sequences = aligned_sequences))

Cellular localization
Epitope colocalization
CD7
Isoforms expression
# to view the categories, print list_of_categories
# import transcript data from ensembl website --> I want to keep only the protein coding transcripts
CD7_transcripts <- read_xlsx("../data/isoforms/transcripts_CD7.xlsx",
col_names = TRUE, skip = 1) # skip = 1 is necessary to have colnames
# select the tissues and cancer of interest and find isoforms expression in those tissues
CD7_isoforms <- isoforms_selected_tissues(hgnc = "CD7", TCGA_interest = c("acute myeloid leukemia", "diffuse large b-cell lymphoma"), GTEX_interest = c("lung", "small intestine", "spleen", "stomach"), gte_list = gte_list, pheno = pheno, enst_expr = enst_expr, hgnc_from_ensembl = hgnc_from_ensembl, ensembl_table = CD7_transcripts)
# (plot <- boxplot_isoforms_all_tissues(hgnc = "CD7", gte_list = gte_list, pheno = pheno, enst_expr = enst_expr, hgnc_from_ensembl = hgnc_from_ensembl))
# plot the tcga and gtex of interest
lt <- CD7_isoforms$gtex$enst_id %>% unique() %>% length()
palette <- wes_palette("FantasticFox1", lt, "continuous")
plot_tcga <- ggplot(CD7_isoforms$tcga,
mapping = aes(x = category, y = expression_level, fill = enst_id)) +
geom_boxplot(alpha = 0.9, outlier.shape = NA, width = 0.9) +
scale_fill_manual(values = palette) +
ylim(0, max(CD7_isoforms$tcga$expression_level)) +
theme_light() +
xlab(" ") +
ylab("Log2(TPM)") +
theme(axis.text.x = element_text(angle = 40, hjust = 1, size = 35),
axis.text.y = element_text(size = 30),
axis.title = element_text(size = 30),
plot.title = element_text(size = 50),
legend.text = element_text(size = 30),
legend.title = element_text(size = 35),
plot.margin = unit(c(1,10,1,10), "cm"),
legend.position = "bottom") +
ggtitle(label = "CD7 expression for TCGA cancers")
plot_gtex <- ggplot(CD7_isoforms$gtex,
mapping = aes(x = category, y = expression_level, fill = enst_id)) +
geom_boxplot(alpha = 0.9, outlier.shape = NA, width = 0.9) +
scale_fill_manual(values = palette) +
ylim(0, max(CD7_isoforms$tcga$expression_level)) +
theme_light() +
xlab(" ") +
ylab("Log2(TPM)") +
theme(axis.text.x = element_text(angle = 40, hjust = 1, size = 35),
axis.text.y = element_text(size = 30),
axis.title = element_text(size = 30),
plot.title = element_text(size = 50),
# legend.text = element_text(size = 30),
# legend.title = element_text(size = 35),
plot.margin = unit(c(1,6,1,4), "cm"),
legend.position = "none") +
ggtitle(label = paste("CD7 expression for GTEX normal tissues"))
(plot <- plot_tcga + plot_gtex)

ggsave(filename = "CD7_isoforms_expression_selected_tissues_plot.pdf", plot = plot, device = "pdf",
path = "../results/plots/isoform_expression/",
width = 120, height = 55, units = "cm", limitsize = F)
Protein properties
deeptmhmm_output <- read.table("../results/Deeptmhmm/canonical_and_isoforms/CD7", fill = T)
# import object (created using the msa script)
aligned_sequences <- read_fasta(paste("../results/aligned_sequences/CD7_msa.txt"))
(CD7_topology <- protein_properties(hgnc = "CD7", hgnc_from_ensembl = hgnc_from_ensembl, ensembl_table = CD7_transcripts, deeptmhmm_output = deeptmhmm_output, aligned_sequences = aligned_sequences))

MSLN
Isoforms expression
# to view the categories, print list_of_categories
# import transcript data from ensembl website --> I want to keep only the protein coding transcripts
MSLN_transcripts <- read_xlsx("../data/isoforms/transcripts_MSLN.xlsx",
col_names = TRUE, skip = 1) # skip = 1 is necessary to have colnames
# select the tissues and cancer of interest and find isoforms expression in those tissues
MSLN_isoforms <- isoforms_selected_tissues(hgnc = "MSLN", TCGA_interest = c("lung squamous cell carcinoma", "mesothelioma", "colon adenocarcinoma", "rectum adenocarcinoma", "pancreatic adenocarcinoma", "ovarian serous cystadenocarcinoma", "stomach adenocarcinoma", "cholangiocarcinoma", "cervical & endocervical cancer", "uterine corpus endometrioid carcinoma", "breast invasive carcinoma"), GTEX_interest = c("lung", "kidney", "uterus", "fallopian tube", "heart"), gte_list = gte_list, pheno = pheno_nh, enst_expr = enst_expr, hgnc_from_ensembl = hgnc_from_ensembl, ensembl_table = MSLN_transcripts)
# (plot <- boxplot_isoforms_all_tissues(hgnc = "MSLN", gte_list = gte_list, pheno = pheno_nh, enst_expr = enst_expr, hgnc_from_ensembl = hgnc_from_ensembl))
# plot the tcga and gtex of interest
lt <- MSLN_isoforms$gtex$enst_id %>% unique() %>% length()
palette <- wes_palette("FantasticFox1", lt, "continuous")
plot_tcga <- ggplot(MSLN_isoforms$tcga,
mapping = aes(x = category, y = expression_level, fill = enst_id)) +
geom_boxplot(alpha = 0.9, outlier.shape = NA, width = 0.9) +
scale_fill_manual(values = palette) +
ylim(0, max(MSLN_isoforms$tcga$expression_level)) +
theme_light() +
xlab(" ") +
ylab("Log2(TPM)") +
theme(axis.text.x = element_text(angle = 40, hjust = 1, size = 35),
axis.text.y = element_text(size = 30),
axis.title = element_text(size = 30),
plot.title = element_text(size = 50),
legend.text = element_text(size = 30),
legend.title = element_text(size = 32),
plot.margin = unit(c(1,6,1,10), "cm"),
legend.position = "bottom") +
ggtitle(label = "MSLN expression for TCGA cancers")
plot_gtex <- ggplot(MSLN_isoforms$gtex,
mapping = aes(x = category, y = expression_level, fill = enst_id)) +
geom_boxplot(alpha = 0.9, outlier.shape = NA, width = 0.9) +
scale_fill_manual(values = palette) +
ylim(0, max(MSLN_isoforms$tcga$expression_level)) +
theme_light() +
xlab(" ") +
ylab("Log2(TPM)") +
theme(axis.text.x = element_text(angle = 40, hjust = 1, size = 35),
axis.text.y = element_text(size = 30),
axis.title = element_text(size = 30),
plot.title = element_text(size = 50),
# legend.text = element_text(size = 30),
# legend.title = element_text(size = 35),
plot.margin = unit(c(1,6,1,4), "cm"),
legend.position = "none") +
ggtitle(label = paste("MSLN expression for GTEX normal tissues"))
(plot <- plot_tcga + plot_gtex + plot_layout(widths = c(2, 1)))

ggsave(filename = "MSLN_isoforms_expression_selected_tissues_plot.pdf", plot = plot, device = "pdf",
path = "../results/plots/isoform_expression/",
width = 140, height = 55, units = "cm", limitsize = F)
Protein properties
deeptmhmm_output <- read.table("../results/Deeptmhmm/canonical_and_isoforms/MSLN", fill = T)
# import object (created using the msa script)
aligned_sequences <- read_fasta("../results/aligned_sequences/MSLN_msa.txt")
(MSLN_topology <- protein_properties(hgnc = "MSLN", hgnc_from_ensembl = hgnc_from_ensembl, ensembl_table = MSLN_transcripts, deeptmhmm_output = deeptmhmm_output, aligned_sequences = aligned_sequences))

CD19
Isoforms expression
# to view the categories, print list_of_categories
# import transcript data from ensembl website --> I want to keep only the protein coding transcripts
CD19_transcripts <- read_xlsx("../data/isoforms/transcripts_CD19.xlsx",
col_names = TRUE, skip = 1) # skip = 1 is necessary to have colnames
# select the tissues and cancer of interest and find isoforms expression in those tissues
CD19_isoforms <- isoforms_selected_tissues(hgnc = "CD19", TCGA_interest = c("acute myeloid leukemia", "diffuse large b-cell lymphoma", "glioblastoma multiforme", "uveal melanoma", "pheochromocytoma & paraganglioma", "adrenocortical cancer", "thyroid carcinoma", "breast invasive carcinoma", "lung squamous cell carcinoma", "lung adenocarcinoma", "sarcoma", "pancreatic adenocarcinoma"), GTEX_interest = c("small intestine", "spleen", "stomach", "testis"), gte_list = gte_list, pheno = pheno_nh, enst_expr = enst_expr, hgnc_from_ensembl = hgnc_from_ensembl, ensembl_table = CD19_transcripts)
# (plot <- boxplot_isoforms_all_tissues(hgnc = "CD19", gte_list = gte_list, pheno = pheno, enst_expr = enst_expr, hgnc_from_ensembl = hgnc_from_ensembl))
# plot the tcga and gtex of interest
lt <- CD19_isoforms$gtex$enst_id %>% unique() %>% length()
palette <- wes_palette("FantasticFox1", lt, "continuous")
plot_tcga <- ggplot(CD19_isoforms$tcga,
mapping = aes(x = category, y = expression_level, fill = enst_id)) +
geom_boxplot(alpha = 0.9, outlier.shape = NA, width = 0.9) +
scale_fill_manual(values = palette) +
ylim(0, max(CD19_isoforms$tcga$expression_level)) +
theme_light() +
xlab(" ") +
ylab("Log2(TPM)") +
theme(axis.text.x = element_text(angle = 40, hjust = 1, size = 35),
axis.text.y = element_text(size = 30),
axis.title = element_text(size = 30),
plot.title = element_text(size = 50),
legend.text = element_text(size = 30),
legend.title = element_text(size = 32),
plot.margin = unit(c(1,6,1,10), "cm"),
legend.position = "bottom") +
ggtitle(label = "CD19 expression for TCGA cancers")
plot_gtex <- ggplot(CD19_isoforms$gtex,
mapping = aes(x = category, y = expression_level, fill = enst_id)) +
geom_boxplot(alpha = 0.9, outlier.shape = NA, width = 0.9) +
scale_fill_manual(values = palette) +
ylim(0, max(CD19_isoforms$gtex$expression_level)) +
theme_light() +
xlab(" ") +
ylab("Log2(TPM)") +
theme(axis.text.x = element_text(angle = 40, hjust = 1, size = 35),
axis.text.y = element_text(size = 30),
axis.title = element_text(size = 30),
plot.title = element_text(size = 50),
# legend.text = element_text(size = 30),
# legend.title = element_text(size = 35),
plot.margin = unit(c(1,6,1,4), "cm"),
legend.position = "none") +
ggtitle(label = paste("CD19 expression for GTEX normal tissues"))
(plot <- plot_tcga + plot_gtex + plot_layout(widths = c(2, 1)))

ggsave(filename = "CD19_isoforms_expression_selected_tissues_plot.pdf", plot = plot, device = "pdf",
path = "../results/plots/isoform_expression/",
width = 120, height = 55, units = "cm", limitsize = F)
Protein properties
deeptmhmm_output <- read.table("../results/Deeptmhmm/canonical_and_isoforms/CD19", fill = T)
# import object (created using the msa script)
aligned_sequences <- read_fasta("../results/aligned_sequences/CD19_msa.txt")
(CD19_topology <- protein_properties(hgnc = "CD19", hgnc_from_ensembl = hgnc_from_ensembl, ensembl_table = CD19_transcripts, deeptmhmm_output = deeptmhmm_output, aligned_sequences = aligned_sequences))
